Let’s start by reading in the four datasets from the Johns Hopkins
CSSE COVID-19 repository. We’ll use read_csv() to load each
dataset into separate variables with descriptive names.
# Base URL for the COVID-19 data repository
base_url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/"
# Read in all four datasets with descriptive variable names
global_cases <- read_csv(paste0(base_url, "time_series_covid19_confirmed_global.csv"))
global_deaths <- read_csv(paste0(base_url, "time_series_covid19_deaths_global.csv"))
us_cases <- read_csv(paste0(base_url, "time_series_covid19_confirmed_US.csv"))
us_deaths <- read_csv(paste0(base_url, "time_series_covid19_deaths_US.csv"))
# Display basic information about what we just loaded
cat("Dataset dimensions:\n")
## Dataset dimensions:
cat("Global Cases:", nrow(global_cases), "rows x", ncol(global_cases), "columns\n")
## Global Cases: 289 rows x 1147 columns
cat("Global Deaths:", nrow(global_deaths), "rows x", ncol(global_deaths), "columns\n")
## Global Deaths: 289 rows x 1147 columns
cat("US Cases:", nrow(us_cases), "rows x", ncol(us_cases), "columns\n")
## US Cases: 3342 rows x 1154 columns
cat("US Deaths:", nrow(us_deaths), "rows x", ncol(us_deaths), "columns\n")
## US Deaths: 3342 rows x 1155 columns
Before we start cleaning, let’s look at what’s in these datasets. This is always the first step in any data analysis!
# Let's look at the global cases dataset first
cat("Global Cases - First few columns:\n")
## Global Cases - First few columns:
global_cases %>%
select(1:8) %>% # Just show first 8 columns to see the structure
head(3) %>%
kable()
| Province/State | Country/Region | Lat | Long | 1/22/20 | 1/23/20 | 1/24/20 | 1/25/20 |
|---|---|---|---|---|---|---|---|
| NA | Afghanistan | 33.93911 | 67.70995 | 0 | 0 | 0 | 0 |
| NA | Albania | 41.15330 | 20.16830 | 0 | 0 | 0 | 0 |
| NA | Algeria | 28.03390 | 1.65960 | 0 | 0 | 0 | 0 |
cat("\nColumn names in global_cases (first 10):\n")
##
## Column names in global_cases (first 10):
names(global_cases)[1:10]
## [1] "Province/State" "Country/Region" "Lat" "Long"
## [5] "1/22/20" "1/23/20" "1/24/20" "1/25/20"
## [9] "1/26/20" "1/27/20"
# Let's see what the date columns look like
date_columns <- names(global_cases)[5:10] # Show first few date columns
cat("\nSample date column names:\n")
##
## Sample date column names:
print(date_columns)
## [1] "1/22/20" "1/23/20" "1/24/20" "1/25/20" "1/26/20" "1/27/20"
Now let’s clean up the data. We want to make it tidy, which means: - Each observation (date-location combination) gets its own row - Remove unnecessary columns (Lat/Long) - Convert date columns to proper date format
# Start with global_cases and make it tidy
global_cases_clean <- global_cases %>%
# Step 1: Remove Lat and Long columns (we don't need them)
select(-Lat, -Long) %>%
# Step 2: Make column names more R-friendly
rename(
province_state = `Province/State`,
country_region = `Country/Region`
) %>%
# Step 3: Pivot longer - convert wide format to long format
# Everything except province_state and country_region should become rows
pivot_longer(
cols = -c(province_state, country_region),
names_to = "date",
values_to = "cases"
) %>%
# Step 4: Convert date column to proper Date format
mutate(date = mdy(date)) %>% # mdy = month/day/year format
# Step 5: Arrange by date for better organization
arrange(date)
# Let's see what we accomplished
cat("Cleaned Global Cases Data Structure:\n")
## Cleaned Global Cases Data Structure:
global_cases_clean %>%
head(10) %>%
kable()
| province_state | country_region | date | cases |
|---|---|---|---|
| NA | Afghanistan | 2020-01-22 | 0 |
| NA | Albania | 2020-01-22 | 0 |
| NA | Algeria | 2020-01-22 | 0 |
| NA | Andorra | 2020-01-22 | 0 |
| NA | Angola | 2020-01-22 | 0 |
| NA | Antarctica | 2020-01-22 | 0 |
| NA | Antigua and Barbuda | 2020-01-22 | 0 |
| NA | Argentina | 2020-01-22 | 0 |
| NA | Armenia | 2020-01-22 | 0 |
| Australian Capital Territory | Australia | 2020-01-22 | 0 |
cat(paste("\nWe went from", ncol(global_cases), "columns to", ncol(global_cases_clean), "columns"))
##
## We went from 1147 columns to 4 columns
cat(paste("\nWe went from", nrow(global_cases), "rows to", nrow(global_cases_clean), "rows"))
##
## We went from 289 rows to 330327 rows
# Do the same thing for global deaths
global_deaths_clean <- global_deaths %>%
select(-Lat, -Long) %>%
rename(
province_state = `Province/State`,
country_region = `Country/Region`
) %>%
pivot_longer(
cols = -c(province_state, country_region),
names_to = "date",
values_to = "deaths"
) %>%
mutate(date = mdy(date)) %>%
arrange(date)
cat("Sample of cleaned Global Deaths data:\n")
## Sample of cleaned Global Deaths data:
global_deaths_clean %>%
head(5) %>%
kable()
| province_state | country_region | date | deaths |
|---|---|---|---|
| NA | Afghanistan | 2020-01-22 | 0 |
| NA | Albania | 2020-01-22 | 0 |
| NA | Algeria | 2020-01-22 | 0 |
| NA | Andorra | 2020-01-22 | 0 |
| NA | Angola | 2020-01-22 | 0 |
The US data has more columns, but we’ll follow the same process:
# Clean US cases data
us_cases_clean <- us_cases %>%
# US data has more columns - let's keep the important ones
select(-UID, -iso2, -iso3, -code3, -FIPS, -Lat, -Long_) %>%
rename(
province_state = Province_State,
country_region = Country_Region,
admin2 = Admin2,
combined_key = Combined_Key
) %>%
pivot_longer(
cols = -c(province_state, country_region, admin2, combined_key),
names_to = "date",
values_to = "cases"
) %>%
mutate(date = mdy(date)) %>%
arrange(date)
# Clean US deaths data
us_deaths_clean <- us_deaths %>%
select(-UID, -iso2, -iso3, -code3, -FIPS, -Lat, -Long_) %>%
rename(
province_state = Province_State,
country_region = Country_Region,
admin2 = Admin2,
combined_key = Combined_Key,
population = Population
) %>%
pivot_longer(
cols = -c(province_state, country_region, admin2, combined_key, population),
names_to = "date",
values_to = "deaths"
) %>%
mutate(date = mdy(date)) %>%
arrange(date)
cat("US Cases - now in tidy format:\n")
## US Cases - now in tidy format:
us_cases_clean %>%
head(5) %>%
kable()
| admin2 | province_state | country_region | combined_key | date | cases |
|---|---|---|---|---|---|
| Autauga | Alabama | US | Autauga, Alabama, US | 2020-01-22 | 0 |
| Baldwin | Alabama | US | Baldwin, Alabama, US | 2020-01-22 | 0 |
| Barbour | Alabama | US | Barbour, Alabama, US | 2020-01-22 | 0 |
| Bibb | Alabama | US | Bibb, Alabama, US | 2020-01-22 | 0 |
| Blount | Alabama | US | Blount, Alabama, US | 2020-01-22 | 0 |
Let’s verify our data cleaning worked by checking Alabama data (as we did earlier):
# Check Alabama data from March 2020 to verify our cleaning worked
alabama_march <- us_cases_clean %>%
filter(province_state == "Alabama",
date >= as.Date("2020-03-01"),
date <= as.Date("2020-03-31")) %>%
group_by(date) %>%
summarise(total_cases = sum(cases, na.rm = TRUE), .groups = 'drop') %>%
filter(total_cases > 0)
cat("Alabama's first COVID-19 cases in March 2020:\n")
## Alabama's first COVID-19 cases in March 2020:
alabama_march %>%
head(10) %>%
kable()
| date | total_cases |
|---|---|
| 2020-03-11 | 3 |
| 2020-03-12 | 4 |
| 2020-03-13 | 8 |
| 2020-03-14 | 15 |
| 2020-03-15 | 28 |
| 2020-03-16 | 36 |
| 2020-03-17 | 51 |
| 2020-03-18 | 61 |
| 2020-03-19 | 88 |
| 2020-03-20 | 115 |
# Let's also check why we had zeros in January
alabama_january <- us_cases_clean %>%
filter(province_state == "Alabama",
date == as.Date("2020-01-22")) %>%
group_by(date) %>%
summarise(total_cases = sum(cases, na.rm = TRUE), .groups = 'drop')
cat(paste("\nAlabama cases on January 22, 2020:", alabama_january$total_cases))
##
## Alabama cases on January 22, 2020: 0
cat("\nThis makes sense - Alabama's first case was reported in March!")
##
## This makes sense - Alabama's first case was reported in March!
Now that we have clean, tidy data, let’s create some summary statistics:
# Calculate global totals by date
global_summary <- global_cases_clean %>%
group_by(date) %>%
summarise(total_cases = sum(cases, na.rm = TRUE), .groups = 'drop')
global_deaths_summary <- global_deaths_clean %>%
group_by(date) %>%
summarise(total_deaths = sum(deaths, na.rm = TRUE), .groups = 'drop')
# Combine cases and deaths
global_combined <- global_summary %>%
left_join(global_deaths_summary, by = "date") %>%
mutate(case_fatality_rate = round((total_deaths / total_cases) * 100, 2))
# Show latest global numbers
latest_global <- global_combined %>%
filter(date == max(date))
cat("Latest Global COVID-19 Numbers:\n")
## Latest Global COVID-19 Numbers:
kable(latest_global,
col.names = c("Date", "Total Cases", "Total Deaths", "CFR (%)"),
format.args = list(big.mark = ","))
| Date | Total Cases | Total Deaths | CFR (%) |
|---|---|---|---|
| 2023-03-09 | 676,570,149 | 6,881,802 | 1.02 |
# US state-level summary for the most recent date
us_state_summary <- us_cases_clean %>%
filter(date == max(date)) %>%
group_by(province_state) %>%
summarise(total_cases = sum(cases, na.rm = TRUE), .groups = 'drop') %>%
arrange(desc(total_cases)) %>%
head(15)
us_deaths_summary <- us_deaths_clean %>%
filter(date == max(date)) %>%
group_by(province_state) %>%
summarise(total_deaths = sum(deaths, na.rm = TRUE), .groups = 'drop')
# Combine cases and deaths
us_combined <- us_state_summary %>%
left_join(us_deaths_summary, by = "province_state") %>%
mutate(case_fatality_rate = round((total_deaths / total_cases) * 100, 2))
cat("Top 15 US States by Total Cases:\n")
## Top 15 US States by Total Cases:
kable(us_combined,
col.names = c("State", "Total Cases", "Total Deaths", "CFR (%)"),
format.args = list(big.mark = ","))
| State | Total Cases | Total Deaths | CFR (%) |
|---|---|---|---|
| California | 12,129,699 | 101,159 | 0.83 |
| Texas | 8,466,220 | 93,390 | 1.10 |
| Florida | 7,574,590 | 86,850 | 1.15 |
| New York | 6,794,738 | 77,157 | 1.14 |
| Illinois | 4,083,292 | 41,496 | 1.02 |
| Pennsylvania | 3,527,854 | 50,398 | 1.43 |
| North Carolina | 3,472,644 | 28,432 | 0.82 |
| Ohio | 3,400,652 | 41,796 | 1.23 |
| Georgia | 3,068,208 | 42,489 | 1.38 |
| Michigan | 3,064,125 | 42,205 | 1.38 |
| New Jersey | 3,048,984 | 36,015 | 1.18 |
| Tennessee | 2,515,130 | 29,263 | 1.16 |
| Arizona | 2,443,514 | 33,102 | 1.35 |
| Virginia | 2,291,951 | 23,666 | 1.03 |
| Massachusetts | 2,224,337 | 24,333 | 1.09 |
Now let’s create some visualizations with our clean, tidy data:
# Plot global cases over time
p1 <- ggplot(global_combined, aes(x = date, y = total_cases)) +
geom_line(color = "steelblue", size = 1.2) +
scale_y_continuous(labels = comma_format()) +
scale_x_date(date_breaks = "3 months", date_labels = "%Y-%m") +
labs(
title = "Global COVID-19 Confirmed Cases Over Time",
subtitle = "Cumulative cases from Johns Hopkins CSSE data",
x = "Date",
y = "Total Confirmed Cases"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Plot global deaths over time
p2 <- ggplot(global_combined, aes(x = date, y = total_deaths)) +
geom_line(color = "red", size = 1.2) +
scale_y_continuous(labels = comma_format()) +
scale_x_date(date_breaks = "3 months", date_labels = "%Y-%m") +
labs(
title = "Global COVID-19 Deaths Over Time",
subtitle = "Cumulative deaths from Johns Hopkins CSSE data",
x = "Date",
y = "Total Deaths"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p1)
print(p2)
# Find top 10 countries by latest case count
top_countries <- global_cases_clean %>%
filter(date == max(date)) %>%
group_by(country_region) %>%
summarise(total_cases = sum(cases, na.rm = TRUE), .groups = 'drop') %>%
arrange(desc(total_cases)) %>%
head(10) %>%
pull(country_region)
# Create trend data for top countries
top_countries_trends <- global_cases_clean %>%
filter(country_region %in% top_countries) %>%
group_by(country_region, date) %>%
summarise(total_cases = sum(cases, na.rm = TRUE), .groups = 'drop')
# Plot trends
p3 <- ggplot(top_countries_trends, aes(x = date, y = total_cases, color = country_region)) +
geom_line(size = 1.1) +
scale_y_continuous(labels = comma_format()) +
scale_x_date(date_breaks = "4 months", date_labels = "%Y-%m") +
labs(
title = "COVID-19 Confirmed Cases: Top 10 Countries",
subtitle = "Countries ranked by total cumulative cases",
x = "Date",
y = "Confirmed Cases",
color = "Country"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "bottom"
) +
guides(color = guide_legend(nrow = 2))
print(p3)
# Find top 10 US states
top_states <- us_cases_clean %>%
filter(date == max(date)) %>%
group_by(province_state) %>%
summarise(total_cases = sum(cases, na.rm = TRUE), .groups = 'drop') %>%
arrange(desc(total_cases)) %>%
head(10) %>%
pull(province_state)
# Create trend data for top states
top_states_trends <- us_cases_clean %>%
filter(province_state %in% top_states) %>%
group_by(province_state, date) %>%
summarise(total_cases = sum(cases, na.rm = TRUE), .groups = 'drop')
# Plot trends
p4 <- ggplot(top_states_trends, aes(x = date, y = total_cases, color = province_state)) +
geom_line(size = 1.1) +
scale_y_continuous(labels = comma_format()) +
scale_x_date(date_breaks = "4 months", date_labels = "%Y-%m") +
labs(
title = "COVID-19 Confirmed Cases: Top 10 US States",
subtitle = "States ranked by total cumulative cases",
x = "Date",
y = "Confirmed Cases",
color = "State"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "bottom"
) +
guides(color = guide_legend(nrow = 2))
print(p4)
Let’s create some interactive plots for better exploration:
# Interactive plot for global data
interactive_global <- plot_ly(global_combined, x = ~date, y = ~total_cases,
type = 'scatter', mode = 'lines',
name = 'Confirmed Cases',
line = list(color = 'steelblue'),
hovertemplate = paste("Date: %{x}<br>",
"Cases: %{y:,}<br>",
"<extra></extra>")) %>%
add_trace(y = ~total_deaths, name = 'Deaths',
line = list(color = 'red'),
yaxis = 'y2',
hovertemplate = paste("Date: %{x}<br>",
"Deaths: %{y:,}<br>",
"<extra></extra>")) %>%
layout(
title = "Global COVID-19: Cases and Deaths Over Time",
xaxis = list(title = "Date"),
yaxis = list(title = "Confirmed Cases", side = "left"),
yaxis2 = list(title = "Deaths", side = "right", overlaying = "y"),
hovermode = 'x unified',
legend = list(x = 0.02, y = 0.98)
)
interactive_global
# Create a comprehensive country table
country_table <- global_cases_clean %>%
filter(date == max(date)) %>%
group_by(country_region) %>%
summarise(confirmed_cases = sum(cases, na.rm = TRUE), .groups = 'drop') %>%
left_join(
global_deaths_clean %>%
filter(date == max(date)) %>%
group_by(country_region) %>%
summarise(deaths = sum(deaths, na.rm = TRUE), .groups = 'drop'),
by = "country_region"
) %>%
mutate(
case_fatality_rate = round((deaths / confirmed_cases) * 100, 2),
case_fatality_rate = ifelse(is.infinite(case_fatality_rate) | is.nan(case_fatality_rate), 0, case_fatality_rate)
) %>%
arrange(desc(confirmed_cases))
datatable(country_table,
colnames = c("Country/Region", "Confirmed Cases", "Deaths", "CFR (%)"),
caption = "COVID-19 Statistics by Country (Latest Date)",
options = list(
pageLength = 15,
scrollX = TRUE,
searchHighlight = TRUE
)) %>%
formatRound(columns = c("case_fatality_rate"), digits = 2) %>%
formatCurrency(columns = c("confirmed_cases", "deaths"),
currency = "", interval = 3, mark = ",", digits = 0)
read_csv()pivot_longer()read_csv(): Load dataselect(): Choose/remove columnsrename(): Make column names R-friendlypivot_longer(): Wide to long format conversion ⭐mutate(): Create new variablesgroup_by() + summarise(): Calculate
summariesfilter(): Subset dataarrange(): Sort dataThe original data had one column per date (wide format). After cleaning: - Each row = one observation (location + date combination) - Each column = one variable - Much easier to analyze, filter, and visualize!
pivot_longer()
parametersData Source: Johns Hopkins University Center for
Systems Science and Engineering (JHU CSSE)
Last Updated: 2025-10-02